MLS Salary Reproducible Analysis

Sample analysis

Tom Worville


In [1]:
import os
import glob
import pandas as pd

In [2]:
# Common prefix of the per-season salary CSVs in the data-is-plural
# mls-salaries repository; a season's file is this prefix + "<year>.csv".
URL = ('https://github.com/data-is-plural/mls-salaries/raw/master/csvs/'
       'mls-salaries-')

In [3]:
# Seasons to analyse: 2007 through 2016 inclusive (range excludes its stop value).
years = range(2007, 2016 + 1)

In [4]:
# Download one salary CSV per season into the current working directory,
# saved as "<year>_salaries.csv".
# urlretrieve moved between Python versions: it lives in urllib.request on
# Python 3 and directly in urllib on Python 2, so try the new location first.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

for year in years:
    season_url = '{}{}.csv'.format(URL, year)
    local_name = '{}_salaries.csv'.format(year)
    urlretrieve(season_url, local_name)

In [5]:
# Collect the season files from the working directory.
# Only "*_salaries.csv" is matched (the naming used when the files are saved),
# so unrelated CSVs that happen to sit in the same folder are not picked up.
# glob returns names in arbitrary order; sorting makes the load order
# deterministic.
path = os.getcwd()
all_files = sorted(glob.glob(os.path.join(path, "*_salaries.csv")))

In [6]:
# Read each season file, tag its rows with the season year and a combined
# player name, and stack everything into a single DataFrame named `data`.
season_frames = []
for csv_path in sorted(all_files):
    # Look for the year in the file name only. Matching against the full path
    # would also hit a year that appears in a parent directory name and load
    # the same file once for every matching year.
    file_name = os.path.basename(csv_path)
    year = next((y for y in years if str(y) in file_name), None)
    if year is None:
        continue  # not one of the requested seasons
    print(year)
    season = pd.read_csv(csv_path)
    season['season'] = year
    season['player'] = season['first_name'] + " " + season['last_name']
    season_frames.append(season)

# Concatenate once instead of growing `data` inside the loop, which re-copies
# every earlier season on each pass. ignore_index gives each row a unique
# label rather than restarting at 0 for every season. pd.concat raises
# ValueError on an empty list, so fall back to an empty frame when no file
# matched.
data = (pd.concat(season_frames, ignore_index=True)
        if season_frames else pd.DataFrame())


2007
2008
2009
2010
2011
2012
2013
2014
2015
2016

In [7]:
# Preview the first five rows of the combined frame.
data.head(n=5)


Out[7]:
club last_name first_name position base_salary guaranteed_compensation season player
0 CHI Armas Chris M 225000.0 225000.0 2007 Chris Armas
1 CHI Banner Michael M 12900.0 12900.0 2007 Michael Banner
2 CHI Barrett Chad F 41212.5 48712.5 2007 Chad Barrett
3 CHI Blanco Cuauhtemoc F 2492316.0 2666778.0 2007 Cuauhtemoc Blanco
4 CHI Brown C.J. D 106391.0 106391.0 2007 C.J. Brown

In [8]:
%matplotlib inline

In [9]:
# Switch matplotlib to its 'ggplot' style sheet for the figures that follow.
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [10]:
# Median base salary and guaranteed compensation per season, one line each.
# The salary columns are selected explicitly: pandas >= 2.0 no longer drops
# text columns (club, names, position) silently when aggregating and raises
# TypeError instead.
salary_columns = ['base_salary', 'guaranteed_compensation']
data.groupby('season')[salary_columns].median().plot(legend = False);



In [11]:
# Wide table: one row per season, one column per player, holding the summed
# base salary (NaN where a player has no row for that season).
pivoted = data.pivot_table(
    values='base_salary',
    index='season',
    columns='player',
    aggfunc='sum'
)
# Show just the top-left corner; the full table has one column per player.
pivoted.iloc[:5, :5]


Out[11]:
player AJ Cochran AJ DeLaGarza AJ Soares Aaron Guillen Aaron Hohlbein
season
2007 NaN NaN NaN NaN 30000.0
2008 NaN NaN NaN NaN 33000.0
2009 NaN 36000.0 NaN NaN 34650.0
2010 NaN 45100.0 NaN NaN 40000.0
2011 NaN 55100.0 42000.0 NaN NaN

In [12]:
# One nearly transparent line per player, so dense regions of the salary
# distribution show up darker; the legend is suppressed because there is one
# entry per player.
ax = pivoted.plot(alpha=0.05, legend=False)
# Cap the y-axis at 500,000 so values above it do not set the scale.
salary_cap_for_view = 500000
ax.set_ylim(bottom=0, top=salary_cap_for_view);



In [ ]: